{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Udacity - Machine Learning Engineer Nanodegree\n",
    "## Capstone Project\n",
    "### Title: Development of a LSTM Network to Predict Students’ Answers on Exam Questions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Implementation of DKT:\n",
    "#### Part 1: Define constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = \"data/ASSISTments_skill_builder_data.csv\" # Dataset path\n",
    "verbose = 1 # Verbose = {0,1,2}\n",
    "best_model_weights = \"weights/bestmodel\" # File to save the model.\n",
    "log_dir = \"logs\" # Path to save the logs.\n",
    "optimizer = \"adam\" # Optimizer to use\n",
    "lstm_units = 100 # Number of LSTM units\n",
    "batch_size = 32 # Batch size\n",
    "epochs = 10 # Number of epochs to train\n",
    "dropout_rate = 0.3 # Dropout rate\n",
    "test_fraction = 0.2 # Portion of data to be used for testing\n",
    "validation_fraction = 0.2 # Portion of training data to be used for validation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Part 2: Pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============= Data Summary =============\n",
      "Total number of students: 4160\n",
      "Training set size: 2662\n",
      "Validation set size: 665\n",
      "Testing set size: 832\n",
      "Number of skills: 123\n",
      "Number of features in the input: 246\n",
      "========================================\n"
     ]
    }
   ],
   "source": [
    "from deepkt import deepkt, data_util, metrics\n",
    "\n",
    "\n",
    "dataset, length, nb_features, nb_skills = data_util.load_dataset(fn=fn,\n",
    "                                                                 batch_size=batch_size,\n",
    "                                                                 shuffle=True)\n",
    "\n",
    "train_set, test_set, val_set = data_util.split_dataset(dataset=dataset,\n",
    "                                                       total_size=length,\n",
    "                                                       test_fraction=test_fraction,\n",
    "                                                       val_fraction=validation_fraction)\n",
    "\n",
    "\n",
    "set_sz = length * batch_size\n",
    "test_set_sz = (set_sz * test_fraction)\n",
    "val_set_sz = (set_sz - test_set_sz) * validation_fraction\n",
    "train_set_sz = set_sz - test_set_sz - val_set_sz\n",
    "print(\"============= Data Summary =============\")\n",
    "print(\"Total number of students: %d\" % set_sz)\n",
    "print(\"Training set size: %d\" % train_set_sz)\n",
    "print(\"Validation set size: %d\" % val_set_sz)\n",
    "print(\"Testing set size: %d\" % test_set_sz)\n",
    "print(\"Number of skills: %d\" % nb_skills)\n",
    "print(\"Number of features in the input: %d\" % nb_features)\n",
    "print(\"========================================\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Part 3: Building the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"DKTModel\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "inputs (InputLayer)          [(None, None, 246)]       0         \n",
      "_________________________________________________________________\n",
      "masking (Masking)            (None, None, 246)         0         \n",
      "_________________________________________________________________\n",
      "lstm (LSTM)                  (None, None, 100)         138800    \n",
      "_________________________________________________________________\n",
      "outputs (TimeDistributed)    (None, None, 123)         12423     \n",
      "=================================================================\n",
      "Total params: 151,223\n",
      "Trainable params: 151,223\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "student_model = deepkt.DKTModel(\n",
    "                        nb_features=nb_features,\n",
    "                        nb_skills=nb_skills,\n",
    "                        hidden_units=lstm_units,\n",
    "                        dropout_rate=dropout_rate)\n",
    "\n",
    "student_model.compile(\n",
    "        optimizer=optimizer,\n",
    "        metrics=[\n",
    "            metrics.BinaryAccuracy(),\n",
    "            metrics.AUC(),\n",
    "            metrics.Precision(),\n",
    "            metrics.Recall()\n",
    "        ])\n",
    "\n",
    "student_model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### Part 4: Train the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/10\n",
      "83/83 [==============================] - 12s 146ms/step - loss: 0.0657 - binary_accuracy: 0.7288 - auc: 0.7076 - precision: 0.7478 - recall: 0.9222 - val_loss: 0.0000e+00 - val_binary_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00\n",
      "Epoch 2/10\n",
      "83/83 [==============================] - 9s 108ms/step - loss: 0.0557 - binary_accuracy: 0.7720 - auc: 0.8120 - precision: 0.7765 - recall: 0.9440 - val_loss: 0.0666 - val_binary_accuracy: 0.7878 - val_auc: 0.8344 - val_precision: 0.7908 - val_recall: 0.9487\n",
      "Epoch 3/10\n",
      "83/83 [==============================] - 9s 108ms/step - loss: 0.0612 - binary_accuracy: 0.7788 - auc: 0.8233 - precision: 0.7862 - recall: 0.9348 - val_loss: 0.0607 - val_binary_accuracy: 0.7715 - val_auc: 0.8289 - val_precision: 0.7733 - val_recall: 0.9310\n",
      "Epoch 4/10\n",
      "83/83 [==============================] - 9s 103ms/step - loss: 0.0596 - binary_accuracy: 0.7877 - auc: 0.8315 - precision: 0.7991 - recall: 0.9319 - val_loss: 0.0618 - val_binary_accuracy: 0.7871 - val_auc: 0.8322 - val_precision: 0.7917 - val_recall: 0.9436\n",
      "Epoch 5/10\n",
      "83/83 [==============================] - 9s 109ms/step - loss: 0.0532 - binary_accuracy: 0.7817 - auc: 0.8291 - precision: 0.7958 - recall: 0.9199 - val_loss: 0.0518 - val_binary_accuracy: 0.8046 - val_auc: 0.8671 - val_precision: 0.8093 - val_recall: 0.9352\n",
      "Epoch 6/10\n",
      "83/83 [==============================] - 8s 98ms/step - loss: 0.0584 - binary_accuracy: 0.7768 - auc: 0.8201 - precision: 0.7889 - recall: 0.9232 - val_loss: 0.0522 - val_binary_accuracy: 0.7863 - val_auc: 0.8385 - val_precision: 0.8047 - val_recall: 0.9141\n",
      "Epoch 7/10\n",
      "83/83 [==============================] - 9s 104ms/step - loss: 0.0536 - binary_accuracy: 0.7839 - auc: 0.8295 - precision: 0.7970 - recall: 0.9238 - val_loss: 0.0603 - val_binary_accuracy: 0.7838 - val_auc: 0.8345 - val_precision: 0.7932 - val_recall: 0.9277\n",
      "Epoch 8/10\n",
      "83/83 [==============================] - 9s 107ms/step - loss: 0.0546 - binary_accuracy: 0.7958 - auc: 0.8498 - precision: 0.8080 - recall: 0.9248 - val_loss: 0.0607 - val_binary_accuracy: 0.8053 - val_auc: 0.8586 - val_precision: 0.8169 - val_recall: 0.9346\n",
      "Epoch 9/10\n",
      "83/83 [==============================] - 9s 106ms/step - loss: 0.0543 - binary_accuracy: 0.7795 - auc: 0.8309 - precision: 0.7928 - recall: 0.9147 - val_loss: 0.0509 - val_binary_accuracy: 0.7806 - val_auc: 0.8216 - val_precision: 0.7891 - val_recall: 0.9329\n",
      "Epoch 10/10\n",
      "83/83 [==============================] - 9s 105ms/step - loss: 0.0547 - binary_accuracy: 0.7950 - auc: 0.8464 - precision: 0.8063 - recall: 0.9248 - val_loss: 0.0603 - val_binary_accuracy: 0.7717 - val_auc: 0.8144 - val_precision: 0.7823 - val_recall: 0.9200\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "history = student_model.fit(dataset=train_set,\n",
    "                            epochs=epochs,\n",
    "                            verbose=verbose,\n",
    "                            validation_data=val_set,\n",
    "                            callbacks=[ \n",
    "                                tf.keras.callbacks.CSVLogger(f\"{log_dir}/train.log\"),\n",
    "                                tf.keras.callbacks.ModelCheckpoint(best_model_weights,\n",
    "                                                                   save_best_only=True,\n",
    "                                                                   save_weights_only=True),\n",
    "                                tf.keras.callbacks.TensorBoard(log_dir=log_dir)\n",
    "        ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Part 5: Load the Model with the Best Validation Loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7eff96fd8cd0>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "student_model.load_weights(best_model_weights)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Part 6: Test the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     26/Unknown - 2s 67ms/step - loss: 0.0529 - binary_accuracy: 0.8071 - auc: 0.8599 - precision: 0.8176 - recall: 0.9329"
     ]
    }
   ],
   "source": [
    "result = student_model.evaluate(test_set, verbose=verbose)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
